# Load environment variables from dev.env before reading any of them, so the
# database credentials and scraper paths below stay out of the source code.
from dotenv import load_dotenv
load_dotenv("dev.env")
import os
# PostgreSQL connection settings.
db_name = os.getenv("db_name")
db_username = os.getenv("db_username")
db_password = os.getenv("db_password")
db_host = os.getenv("db_host")
db_port = os.getenv("db_port")
# Scrape targets plus Chrome/Selenium binary locations.
website = os.getenv("website")
city_link = os.getenv("city_link")
chrome_path = os.getenv("chrome_path")
selenium_chrome_driver_path = os.getenv("selenium_chrome_driver_path")
First, we load the environment variables. This is not strictly required for non-professional projects, but it is good practice to keep private values such as database credentials out of the source code, since exposing them could make your server vulnerable.
import psycopg2
import time
import math
import numpy as np
import os.path
import pandas as pd
import re
import pycld2
import nltk
import joblib
import warnings
import urllib.request
import io
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import keras
import sys
import sweetviz as sv
import webbrowser
import gc
import matplotlib.image as mpimg
from datetime import date, timedelta, datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn import mixture
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.model_selection import GridSearchCV
from PIL import Image
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD, RMSprop
from keras.utils import np_utils
from sklearn import metrics
from sklearn.utils import shuffle
from wordcloud import WordCloud
#from IPython.display import display, Image
# Download the NLTK stopword corpus (Turkish stopwords are used for cleaning).
nltk.download('stopwords', quiet=True)
stemmer = TurkishStemmer()
# Open the PostgreSQL connection and the shared cursor used by the helpers below.
conn_string = 'host={pghost} port={pgport} dbname={pgdatabase} user={pguser} password={pgpassword}'.format(pgdatabase=db_name,pguser=db_username,pgpassword=db_password,pghost=db_host,pgport=db_port)
conn=psycopg2.connect(conn_string)
cur=conn.cursor()
# Selenium Chrome configuration; the browser binary path comes from the env file.
options = Options()
options.binary_location = chrome_path
warnings.filterwarnings('ignore')
def check_if_table_exists(schema, table):
    """Return True if *table* exists in *schema* (per information_schema.tables).

    Uses the module-level cursor ``cur``.  Bound parameters replace the
    original str.format interpolation to avoid SQL injection / quoting bugs.
    """
    cur.execute(
        "select exists(select * from information_schema.tables "
        "where table_schema=%s AND table_name=%s)",
        (schema, table),
    )
    return cur.fetchone()[0]
def check_if_index_exists(index):
    """Return True if a pg_class relation named *index* exists.

    Uses the module-level cursor ``cur``.  Bound parameters replace the
    original str.format interpolation to avoid SQL injection / quoting bugs.
    """
    cur.execute(
        "SELECT EXISTS(SELECT * FROM PG_CLASS WHERE relname = %s)",
        (index,),
    )
    return cur.fetchone()[0]
def check_if_file_exists(filename):
    """Tell whether *filename* refers to an existing regular file on disk."""
    file_is_present = os.path.isfile(filename)
    return file_is_present
def execute_mogrify(conn, df, schema, table):
    """Insert every row of *df* into "schema"."table", one INSERT per row.

    Values are interpolated verbatim into the SQL text, so callers are
    expected to have pre-quoted/cast every cell via df_column_conversation
    (e.g. "'2021-06-11'::date").  NOTE(review): despite the name, this does
    not call cursor.mogrify; it is only safe for pre-sanitized dataframes.
    Returns 1 after a database error (rolls back first), otherwise None.
    """
    tuples = [tuple(x) for x in df.to_numpy()]
    # Double-quote each column name so mixed-case identifiers survive.
    cols = '"'+'","'.join(list(df.columns))+'"'
    cursor = conn.cursor()
    try:
        for tup in tuples:
            # ON CONFLICT DO NOTHING makes re-runs idempotent against the
            # unique constraints defined on the target tables.
            query = """INSERT INTO "{schema}"."{table}"({cols}) VALUES ({values}) ON CONFLICT DO NOTHING""".format(schema=schema,table=table, cols=cols, values=",".join(map(str,tup)))
            cursor.execute(query)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()
def df_column_conversation(df, column_name, type):
    """Rewrite df[column_name] in place as SQL-literal text for execute_mogrify.

    'timestamp'/'date' wrap values as quoted casts, 'text' strips single
    quotes and wraps in quotes, 'numeric' swaps decimal commas for dots, and
    'integer' round-trips through float into a nullable Int64 string.
    """
    column = df[column_name]
    if type == 'timestamp':
        df[column_name] = column.apply(lambda value: f"'{value}'::timestamp")
    elif type == 'text':
        df[column_name] = column.str.replace("'", "").apply(lambda value: f"'{value}'")
    elif type == 'date':
        df[column_name] = column.apply(lambda value: f"'{value}'::date")
    elif type == 'numeric':
        df[column_name] = column.apply(str).str.replace(',', '.')
    elif type == 'integer':
        as_floats = column.apply(str).str.replace(',', '.').apply(float)
        df[column_name] = as_floats.astype('Int64').apply(str)
def clean_text(text):
    """Lowercase *text* and strip common punctuation/symbol characters.

    Fix: in the original character class the trailing ``+=-\\`` was parsed as
    the range '=' .. '\\' (which silently covered '[', '@' and the uppercase
    letters) while a literal hyphen was never matched.  The hyphen is now
    escaped so '-' is removed like the other punctuation characters.
    """
    text = text.lower()
    text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=\\\-]", "", text)
    return text
def webpage_scroll_down(driver, scroll_speed):
    """Scroll the current page to the bottom in steps of *scroll_speed* pixels.

    Useful for pages that lazy-load content (e.g. product images) only when
    the viewport reaches them.
    """
    page_height = int(driver.execute_script("return document.body.scrollHeight"))
    for offset in range(1, page_height, scroll_speed):
        scroll_command = "window.scrollTo(0, {});".format(offset)
        driver.execute_script(scroll_command)
def byte_image_to_numpy_array(byte_image, image_size):
    """Decode raw image bytes and resize them to a square (image_size, image_size) array."""
    decoded_image = Image.open(io.BytesIO(byte_image))
    pixel_array = np.array(decoded_image)
    return cv2.resize(pixel_array, (image_size, image_size))
def analyze_data(outputHTML,upToDateCheck,schema,table):
    """Generate a sweetviz EDA report for "schema"."table" and open it in a browser.

    When *upToDateCheck* is truthy and *outputHTML* already exists, the cached
    report file is reused; otherwise the whole table is read and re-analyzed.
    The report is opened in the browser in both cases.
    """
    if not((check_if_file_exists(outputHTML) and upToDateCheck)):
        sql_command = """
SELECT *
FROM "{schema}"."{table}";
""".format(schema=schema, table=table)
        df = pd.read_sql(sql_command,conn)
        eda = sv.analyze(df)
        eda.show_html(outputHTML, open_browser=False)
        # Free the (potentially large) dataframe before opening the browser.
        del df
        gc.collect()
        #eda_sales.show_notebook()
    webbrowser.open(outputHTML,new=2)
    print('You can see the exploratory data analysis visualizations in your browser.')
Here we define our helper functions and import the libraries that will be used in the following steps.
# --- ODS (staging) tables --------------------------------------------------
# Each table is created only if it does not already exist; creation time is
# reported.  Unique constraints back the ON CONFLICT DO NOTHING inserts.
if(check_if_table_exists('ODS','EXT_FB_RESTAURANT')):
    print('Table ODS.EXT_FB_RESTAURANT already exists.')
else:
    start_time = math.trunc(time.time())
    # One row per restaurant; RESTAURANT_ID is the URL slug and unique key.
    cur.execute("""
CREATE TABLE "ODS"."EXT_FB_RESTAURANT"
(
"RESTAURANT_ID" text NOT NULL,
"RESTAURANT_NAME" text,
"RESTAURANT_LINK" text,
"DATE" date,
CONSTRAINT "RESTAURANT_ID" UNIQUE ("RESTAURANT_ID")
);
""")
    cur.execute('COMMIT;')
    end_time = math.trunc(time.time())
    print("Table ODS.EXT_FB_RESTAURANT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_MENU')):
    print('Table ODS.EXT_FB_MENU already exists.')
else:
    start_time = math.trunc(time.time())
    # Raw scraped menu rows; prices are kept as text ("12,50 TL") at this stage.
    cur.execute("""
CREATE TABLE "ODS"."EXT_FB_MENU"
(
"PRODUCT_ID" text NOT NULL,
"RESTAURANT_ID" text,
"CATEGORY_NAME" text,
"PRODUCT_NAME" text,
"PRODUCT_DESCRIPTION" text,
"PRODUCT_LISTED_PRICE" text,
"PRODUCT_PRICE" text,
"DISCOUNT" boolean,
"DESIGN_TYPE" text,
"DATE" date,
CONSTRAINT "PRODUCT_ID" UNIQUE ("PRODUCT_ID")
);
""")
    cur.execute('COMMIT;')
    end_time = math.trunc(time.time())
    print("Table ODS.EXT_FB_MENU created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_COMMENT')):
    print('Table ODS.EXT_FB_COMMENT already exists.')
else:
    start_time = math.trunc(time.time())
    # Raw scraped review rows; scores and dates stay text until the DWH load.
    cur.execute("""
CREATE TABLE "ODS"."EXT_FB_COMMENT"
(
"RESTAURANT_ID" text,
"USERNAME" text,
"COMMENT_TEXT" text,
"COMMENT_DATE" text,
"SPEED" text,
"SERVING" text,
"FLAVOUR" text,
"DATE" date,
CONSTRAINT "UNIQUE_COMMENTS" UNIQUE ("RESTAURANT_ID", "USERNAME", "COMMENT_TEXT")
);
""")
    cur.execute('COMMIT;')
    end_time = math.trunc(time.time())
    print("Table ODS.EXT_FB_COMMENT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_PRODUCT_IMAGE')):
    print('Table ODS.EXT_FB_PRODUCT_IMAGE already exists.')
else:
    start_time = math.trunc(time.time())
    # Product-image links scraped from restaurant pages.
    cur.execute("""
CREATE TABLE "ODS"."EXT_FB_PRODUCT_IMAGE"
(
"PRODUCT_ID" text,
"RESTAURANT_ID" text,
"IMAGE_LINK" text,
"DATE" date,
CONSTRAINT "UNIQUE_IMAGES" UNIQUE ("PRODUCT_ID", "RESTAURANT_ID")
);
""")
    cur.execute('COMMIT;')
    end_time = math.trunc(time.time())
    print("Table ODS.EXT_FB_PRODUCT_IMAGE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_PRODUCT_IMAGE_SOURCE')):
    print('Table ODS.EXT_FB_PRODUCT_IMAGE_SOURCE already exists.')
else:
    start_time = math.trunc(time.time())
    # Downloaded image binaries, keyed by their source URL.
    cur.execute("""
CREATE TABLE "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE"
(
"IMAGE_LINK" text NOT NULL,
"SOURCE" bytea,
CONSTRAINT "UNIQUE_SOURCE" PRIMARY KEY ("IMAGE_LINK")
);
""")
    cur.execute('COMMIT;')
    end_time = math.trunc(time.time())
    print("Table ODS.EXT_FB_PRODUCT_IMAGE_SOURCE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
# --- EDW (warehouse) tables ------------------------------------------------
# Typed counterparts of the ODS staging tables (numeric prices, integer
# scores, real comment dates).
if(check_if_table_exists('EDW','DWH_FB_MENU')):
    print('Table EDW.DWH_FB_MENU already exists.')
else:
    start_time = math.trunc(time.time())
    cur.execute("""
CREATE TABLE "EDW"."DWH_FB_MENU"
(
"PRODUCT_ID" text NOT NULL,
"RESTAURANT_ID" text,
"CATEGORY_NAME" text,
"PRODUCT_NAME" text,
"PRODUCT_DESCRIPTION" text,
"PRODUCT_LISTED_PRICE" numeric,
"PRODUCT_PRICE" numeric,
"DISCOUNT" boolean,
"DESIGN_TYPE" text,
"DATE" date,
CONSTRAINT "PRODUCT_ID" UNIQUE ("PRODUCT_ID")
);
""")
    cur.execute('COMMIT;')
    end_time = math.trunc(time.time())
    print("Table EDW.DWH_FB_MENU created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_COMMENT')):
    print('Table EDW.DWH_FB_COMMENT already exists.')
else:
    start_time = math.trunc(time.time())
    cur.execute("""
CREATE TABLE "EDW"."DWH_FB_COMMENT"
(
"RESTAURANT_ID" text,
"USERNAME" text,
"COMMENT_TEXT" text,
"COMMENT_DATE" date,
"SPEED" integer,
"SERVING" integer,
"FLAVOUR" integer,
"DATE" date,
CONSTRAINT "UNIQUE_COMMENTS" UNIQUE ("RESTAURANT_ID", "USERNAME", "COMMENT_TEXT")
);
""")
    cur.execute('COMMIT;')
    end_time = math.trunc(time.time())
    print("Table EDW.DWH_FB_COMMENT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
# Helper SQL function: cast text to the type of the INOUT default argument,
# silently keeping the default when the cast fails (used below to turn the
# text score columns into integers without aborting on garbage values).
cur.execute("""
CREATE OR REPLACE FUNCTION public.try_cast(_in text, INOUT _out anyelement)
LANGUAGE 'plpgsql'
AS $BODY$
BEGIN
EXECUTE format('SELECT %L::%s', $1, pg_typeof(_out))
INTO _out;
EXCEPTION WHEN others THEN
-- do nothing: _out already carries default
END
$BODY$;
""")
cur.execute('COMMIT;')
Table ODS.EXT_FB_RESTAURANT already exists. Table ODS.EXT_FB_MENU already exists. Table ODS.EXT_FB_COMMENT already exists. Table ODS.EXT_FB_PRODUCT_IMAGE already exists. Table ODS.EXT_FB_PRODUCT_IMAGE_SOURCE already exists. Table EDW.DWH_FB_MENU already exists. Table EDW.DWH_FB_COMMENT already exists.
Creating tables that will contain our data and creating database functions.
# Find the most recent scrape date across the staged menu and comment tables;
# the steps below skip scraping when this date is already current.
cur.execute("""
WITH DATES AS(
SELECT
MAX("DATE") AS "DATE"
FROM "ODS"."EXT_FB_MENU" EFM
UNION ALL
SELECT
MAX("DATE") AS "DATE"
FROM "ODS"."EXT_FB_COMMENT" EFC
)
SELECT
MAX("DATE") AS "LAST_EXECUTION_DATE"
FROM DATES;
""")
last_execution_date = cur.fetchone()[0]
last_execution_date
datetime.date(2021, 6, 11)
Getting the last execution date from the tables so that we do not re-import data we have already collected, which improves performance.
restaurant_list = []
# Cap at the project's cut-off date; otherwise scrape up to yesterday.
end_date = min(date(2021,6,11),(date.today() - timedelta(days=1)))
driver = webdriver.Chrome(options=options, executable_path=selenium_chrome_driver_path)
# Only scrape when the staged data is older than the end date.
if(last_execution_date < end_date):
    driver.get(city_link)
    time.sleep(5)
    # Scroll repeatedly so the infinitely-scrolling list loads every restaurant.
    for i in range(25):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
    city_restaurant_groups = driver.find_elements_by_class_name("restaurant-main-info")
    for restaurant in city_restaurant_groups:
        restaurant_name = restaurant.find_element_by_class_name("restaurant-display-name").text
        # Drop the site's "NEW" badge prefix from the display name.
        restaurant_name = restaurant_name.replace("YENİ ", "")
        restaurant_link = restaurant.find_element_by_class_name("restaurant-display-name").find_element_by_xpath(".//a").get_attribute('href')
        # The URL slug serves as the restaurant's id.
        restaurant_id = restaurant_link.split("/")[-1]
        if(len(restaurant_link) < 2):
            continue
        restaurant_list.append([restaurant_id,restaurant_name,restaurant_link])
    restaurant_df = pd.DataFrame(restaurant_list, columns=["RESTAURANT_ID","RESTAURANT_NAME","RESTAURANT_LINK"])
    # Convert every cell into a quoted SQL literal for execute_mogrify.
    df_column_conversation(restaurant_df, 'RESTAURANT_ID', 'text')
    df_column_conversation(restaurant_df, 'RESTAURANT_NAME', 'text')
    df_column_conversation(restaurant_df, 'RESTAURANT_LINK', 'text')
    restaurant_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
    execute_mogrify(conn,restaurant_df,"ODS","EXT_FB_RESTAURANT")
Scraping the website to gather the restaurant list and their links; in this part we need to scroll down repeatedly to load all the restaurants.
# Select only restaurants that have no staged menu rows yet, so re-runs do
# not re-scrape menus that were already collected.
sql_command = """
SELECT
"RESTAURANT_ID"
FROM "ODS"."EXT_FB_RESTAURANT" EFR
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_MENU" EFM WHERE EFR."RESTAURANT_ID" = EFM."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
# Scrape the full menu of every restaurant that has no staged menu rows yet.
if(last_execution_date < end_date):
    for i in range(len(restaurant_df)):
        sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
        restaurant_link = "{website}/{sublink}".format(website=website,sublink=sublink)
        driver.get(restaurant_link)
        time.sleep(5)
        # Skip restaurants showing the closed/alternative banner.
        try:
            if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
                continue
            else:
                pass
        except Exception:
            pass
        menu = driver.find_element_by_xpath('//*[@id="restaurant_menu"]')
        categories = menu.find_elements_by_xpath('//*[contains(@id,"menu_")]')
        menu_list = []
        for category in categories:
            category_name = category.find_element_by_xpath(".//b").text
            for product in category.find_elements_by_xpath(".//div[2]/ul/li"):
                try:
                    # The site uses three page layouts ("list", "card", "box");
                    # fall through the element lookups to detect which applies.
                    design_type = "list"
                    try:
                        product_id = product.find_elements_by_class_name("getProductDetail")[-1].get_attribute('data-product-id')
                        product_name = product.find_elements_by_class_name("getProductDetail")[-1].text
                    except:
                        product_id = product.find_element_by_xpath(".//strong").get_attribute('data-product-id')
                        product_name = product.find_element_by_xpath(".//strong").text
                        design_type = "card"
                    try:
                        product_description = product.find_element_by_class_name("product-desc").text
                        product_price = product.find_element_by_class_name("price").text
                    except:
                        product_description = product.find_element_by_class_name("productInfo").text
                        product_price = product.find_element_by_class_name("newPrice").text
                        if(not(design_type=="card")):
                            design_type = "box"
                    discount = "TRUE"
                    try:
                        if(design_type=="list"):
                            product_listed_price = product.find_element_by_class_name("listed-price").text
                        if(design_type in ["card","box"]):
                            product_listed_price = product.find_element_by_class_name("listedPrice").text
                    except:
                        # No listed-price element means the product is not discounted.
                        product_listed_price = product_price
                        discount = "FALSE"
                    menu_list.append([product_id,sublink,category_name,product_name,product_description,product_listed_price,product_price,discount,design_type])
                except:
                    continue
        menu_df = pd.DataFrame(menu_list, columns=["PRODUCT_ID","RESTAURANT_ID","CATEGORY_NAME","PRODUCT_NAME","PRODUCT_DESCRIPTION","PRODUCT_LISTED_PRICE","PRODUCT_PRICE","DISCOUNT","DESIGN_TYPE"])
        # Drop rows whose id or name could not be scraped.
        menu_df = menu_df[menu_df['PRODUCT_ID'].str.len() > 0]
        menu_df = menu_df[menu_df['PRODUCT_NAME'].str.len() > 0]
        # Convert every cell into a quoted SQL literal for execute_mogrify.
        df_column_conversation(menu_df, 'PRODUCT_ID', 'text')
        df_column_conversation(menu_df, 'RESTAURANT_ID', 'text')
        df_column_conversation(menu_df, 'CATEGORY_NAME', 'text')
        df_column_conversation(menu_df, 'PRODUCT_NAME', 'text')
        df_column_conversation(menu_df, 'PRODUCT_DESCRIPTION', 'text')
        df_column_conversation(menu_df, 'PRODUCT_LISTED_PRICE', 'text')
        df_column_conversation(menu_df, 'PRODUCT_PRICE', 'text')
        df_column_conversation(menu_df, 'DESIGN_TYPE', 'text')
        menu_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
        execute_mogrify(conn,menu_df,"ODS","EXT_FB_MENU")
menu_df
Looping through all of the restaurants and scraping their products.
# Select only restaurants that have no staged comment rows yet.
sql_command = """
SELECT
"RESTAURANT_ID"
FROM "ODS"."EXT_FB_RESTAURANT" EFR
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_COMMENT" EFC WHERE EFR."RESTAURANT_ID" = EFC."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
# Shuffle AND reset the index: the loop below reads rows via .loc[i, ...] for
# i in range(len(df)), so without reset_index(drop=True) the label-based
# lookups would still visit rows in their original order and the shuffle
# would have no effect (same idiom as the image_df shuffle later in the file).
restaurant_df = restaurant_df.sample(frac=1).reset_index(drop=True)
restaurant_df
| RESTAURANT_ID | |
|---|---|
| 35 | boysan-pizza-esenyurt-mehtercesme-mah-istanbul |
| 7 | salon-kalamis-kadikoy-fenerbahce-mah-istanbul |
| 36 | dominos-pizza-sultangazi-habibler-mah-istanbul |
| 51 | al-ye-kadikoy-dumlupinar-mah-istanbul |
| 59 | trend-216-cafe-restaurant-uskudar-mimar-sinan-... |
| ... | ... |
| 17 | dominos-pizza-atasehir-ataturk-mah-girne-cad-i... |
| 24 | afili-pizza-kadikoy-rasimpasa-mah-istanbul |
| 48 | kafein-plus-kucukcekmece-tevfikbey-mah-sefakoy... |
| 57 | oblomov-kadikoy-caferaga-mah-moda-istanbul |
| 32 | apizza-legal-tuzla-orta-mah-istanbul |
79 rows × 1 columns
# Scrape every comment page of each restaurant selected above.
if(last_execution_date < end_date):
    for i in range(len(restaurant_df)):
        sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
        # Requesting page=9999 makes the site redirect to the real last page,
        # which reveals how many comment pages exist.
        last_comment_page_url = "{website}/{sublink}?section=comments&page=9999".format(website=website,sublink=sublink)
        driver.get(last_comment_page_url)
        time.sleep(0.1)
        comments_list = []
        # A redirect away from the restaurant's URL means it is unavailable.
        if(sublink not in driver.current_url):
            continue
        last_comment_page_redirect_url = driver.current_url
        # Strip everything around the page number to recover the page count.
        last_comment_page_number = int(last_comment_page_redirect_url.replace("&status=closed","").replace("{website}/{sublink}?section=comments&page=".format(website=website,sublink=sublink),""))
        for page_number in range(1, last_comment_page_number+1):
            current_comment_page_url = "{website}/{sublink}?section=comments&page={page_number}".format(website=website,sublink=sublink,page_number=page_number)
            driver.get(current_comment_page_url)
            time.sleep(1)
            # Skip closed restaurants (banner with "sipariş verebilirsiniz.").
            try:
                if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
                    continue
                else:
                    pass
            except Exception:
                pass
            try:
                driver.find_element(By.XPATH, '//*[@id="alternative-restaurant-popup"]/div[1]/div[2]/img').click(); #Closing pop-up
            except Exception:
                pass
            #driver.find_element(By.XPATH, '//*[@id="restaurantDetail"]/div[2]/div[1]/ul/li[4]/a').click(); #Clicking comments
            comment_list = driver.find_elements_by_class_name("comments-body")
            for comment in comment_list:
                # Username/text/date are mandatory; skip the row if any is missing.
                try:
                    username = comment.find_element_by_class_name("userName").text
                    comment_text = comment.find_element_by_xpath('.//p').text
                    comment_date = comment.find_element_by_class_name("commentDate").text
                except NoSuchElementException:
                    continue
                # The three scores are optional; default each to "".
                try:
                    speed = comment.find_element_by_class_name("speed").text
                except NoSuchElementException:
                    speed = ""
                try:
                    serving = comment.find_element_by_class_name("serving").text
                except NoSuchElementException:
                    serving = ""
                try:
                    flavour = comment.find_element_by_class_name("flavour").text
                except NoSuchElementException:
                    flavour = ""
                comments_list.append([sublink, username, comment_text, comment_date, speed, serving, flavour])
        comment_df = pd.DataFrame(comments_list, columns=["RESTAURANT_ID","USERNAME","COMMENT_TEXT","COMMENT_DATE","SPEED","SERVING","FLAVOUR"])
        # Convert every cell into a quoted SQL literal for execute_mogrify.
        df_column_conversation(comment_df, 'RESTAURANT_ID', 'text')
        df_column_conversation(comment_df, 'USERNAME', 'text')
        df_column_conversation(comment_df, 'COMMENT_TEXT', 'text')
        df_column_conversation(comment_df, 'COMMENT_DATE', 'text')
        df_column_conversation(comment_df, 'SPEED', 'text')
        df_column_conversation(comment_df, 'SERVING', 'text')
        df_column_conversation(comment_df, 'FLAVOUR', 'text')
        comment_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
        execute_mogrify(conn,comment_df,"ODS","EXT_FB_COMMENT")
comment_df
Looping through all of the restaurants and scraping their reviews and given scores for each comment.
# Restaurants whose product images have not been scraped yet.  'list'-layout
# pages are excluded because they carry no product images.
sql_command = """
SELECT DISTINCT
"RESTAURANT_ID",
"DESIGN_TYPE"
FROM "ODS"."EXT_FB_MENU" EFM
WHERE 1=1
AND EFM."DESIGN_TYPE" <> 'list'
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_PRODUCT_IMAGE" EFPI WHERE EFM."RESTAURANT_ID" = EFPI."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df
| RESTAURANT_ID | DESIGN_TYPE | |
|---|---|---|
| 0 | the-boss-pizza-burger-sariyer-maden-mah-istanbul | card |
| 1 | 444-pizza-bagcilar-demirkapi-mah-istanbul | card |
| 2 | mezzaluna-paket-beykoz-acarlar-mah-acarkent-is... | card |
| 3 | kafein-plus-kucukcekmece-tevfikbey-mah-sefakoy... | card |
| 4 | mayk-cafe-sisli-fulya-mah-istanbul | card |
| ... | ... | ... |
| 208 | amigos-burger-pizza-kagithane-merkez-mah-istanbul | card |
| 209 | konak-firin-avcilar-merkez-mah-istanbul | card |
| 210 | keyifle-pide-lahmacun-bakirkoy-atakoy-6-kisim-... | card |
| 211 | havelka-kadikoy-caddebostan-mah-istanbul | card |
| 212 | heros-pizza-esenyurt-fatih-mah-istanbul | card |
213 rows × 2 columns
# Scrape product-image links from each selected restaurant page.
if(last_execution_date < end_date):
    for i in range(len(restaurant_df)):
        sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
        design_type = restaurant_df.loc[i,"DESIGN_TYPE"]
        page_url = "{website}/{sublink}".format(website=website,sublink=sublink)
        driver.get(page_url)
        time.sleep(0.1)
        # Scroll so lazy-loaded product images are actually rendered.
        webpage_scroll_down(driver, 15)
        # A redirect away from the restaurant's URL means it is unavailable.
        if(sublink not in driver.current_url):
            continue
        # Skip closed restaurants (banner with "sipariş verebilirsiniz.").
        try:
            if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
                continue
            else:
                pass
        except Exception:
            pass
        try:
            driver.find_element(By.XPATH, '//*[@id="alternative-restaurant-popup"]/div[1]/div[2]/img').click(); #Closing pop-up
        except Exception:
            pass
        menu = driver.find_element_by_xpath('//*[@id="restaurant_menu"]')
        categories = menu.find_elements_by_xpath('//*[contains(@id,"menu_")]')
        image_list = []
        for product in menu.find_elements_by_class_name("product-image"):
            try:
                product_id = product.get_attribute('data-product-id')
                image_link = product.find_element_by_xpath(".//img").get_attribute('src')
                # Skip placeholder images served from the site's asset folder.
                if("www.yemeksepeti.com/assets/images/" in image_link):
                    continue
                image_list.append([product_id,sublink,image_link])
            except:
                pass
        image_df = pd.DataFrame(image_list, columns=["PRODUCT_ID","RESTAURANT_ID","IMAGE_LINK"])
        # Convert every cell into a quoted SQL literal for execute_mogrify.
        df_column_conversation(image_df, 'PRODUCT_ID', 'text')
        df_column_conversation(image_df, 'RESTAURANT_ID', 'text')
        df_column_conversation(image_df, 'IMAGE_LINK', 'text')
        image_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
        execute_mogrify(conn,image_df,"ODS","EXT_FB_PRODUCT_IMAGE")
# All browser work is finished at this point.
driver.quit()
# Image links whose binary content has not been downloaded yet.
sql_command = """
SELECT DISTINCT
"IMAGE_LINK"
FROM "ODS"."EXT_FB_PRODUCT_IMAGE" EFPI
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" EFPIS WHERE EFPIS."IMAGE_LINK" = EFPI."IMAGE_LINK");
"""
image_df = pd.read_sql(sql_command,conn)
# Shuffle (and renumber 0..n-1) so interrupted runs progress through
# different links each time.
image_df = image_df.sample(frac=1).reset_index(drop=True)
image_df
| IMAGE_LINK | |
|---|---|
| 0 | https://cdn.yemeksepeti.com/ProductImages/TR_I... |
# Download each missing image into a scratch file and store its bytes in the
# database; failed downloads are skipped.
for i in range(len(image_df)):
    page_url = image_df.loc[i,"IMAGE_LINK"]
    # The scratch file is overwritten on every iteration.
    image_file_name = "images/product.jpg"
    try:
        urllib.request.urlretrieve(page_url, image_file_name)
    except Exception:
        continue
    # Use a context manager so the file handle is closed deterministically
    # (the original open(...).read() leaked a handle per iteration).
    with open(image_file_name, 'rb') as image_file:
        image_binary = psycopg2.Binary(image_file.read())
    # Bound parameters avoid quoting problems with arbitrary URLs/bytes and
    # replace the original str.format interpolation.
    cur.execute("""
INSERT INTO "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" ("IMAGE_LINK","SOURCE")
VALUES(%s,%s)
ON CONFLICT DO NOTHING;
""", (page_url, image_binary))
    cur.execute('COMMIT;')
Looping through all of the restaurants and scraping their product images for deep learning purposes.
# Load staged menu rows into the warehouse, casting the "12,50 TL"-style price
# strings to numeric; the unique PRODUCT_ID constraint deduplicates re-runs.
cur.execute("""
INSERT INTO "EDW"."DWH_FB_MENU"
SELECT
EFM."PRODUCT_ID",
EFM."RESTAURANT_ID",
EFM."CATEGORY_NAME",
EFM."PRODUCT_NAME",
EFM."PRODUCT_DESCRIPTION",
REPLACE(REPLACE(EFM."PRODUCT_LISTED_PRICE",' TL',''),',','.')::numeric AS "PRODUCT_LISTED_PRICE",
REPLACE(REPLACE(EFM."PRODUCT_PRICE",' TL',''),',','.')::numeric AS "PRODUCT_PRICE",
EFM."DISCOUNT",
EFM."DESIGN_TYPE",
EFM."DATE"
FROM "ODS"."EXT_FB_MENU" EFM
ON CONFLICT DO NOTHING;
""")
cur.execute('COMMIT;')
# Clean the staged comments in SQL and load them into the warehouse:
# - lowercase the text and drop the site's own "Yemeksepeti" replies,
# - strip non-digits from the score columns (try_cast keeps NULL on failure),
# - convert relative Turkish dates ("3 gün önce") into absolute dates by
#   rebuilding them as an interval ("3 day") subtracted from the scrape date.
if(last_execution_date < end_date):
    sql_command = """
WITH CLEAN_DATA AS(
SELECT
EFC."RESTAURANT_ID",
EFC."USERNAME",
LOWER(EFC."COMMENT_TEXT") AS "COMMENT_TEXT",
EFC."COMMENT_DATE",
TRY_CAST(REGEXP_REPLACE(EFC."SPEED", '\D','','g'),NULL::INTEGER) AS "SPEED",
TRY_CAST(REGEXP_REPLACE(EFC."SERVING", '\D','','g'),NULL::INTEGER) AS "SERVING",
TRY_CAST(REGEXP_REPLACE(EFC."FLAVOUR", '\D','','g'),NULL::INTEGER) AS "FLAVOUR",
EFC."DATE",
REGEXP_REPLACE(EFC."COMMENT_DATE", '\D','','g')||' '||REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(REPLACE(EFC."COMMENT_DATE",' önce',''), '[^[:alpha:]]', '', 'g'),'ay','month'),'bugün','today'),'gün','day') AS "COMMENT_DATE_INTERVAL"
FROM "ODS"."EXT_FB_COMMENT" EFC
WHERE 1=1
AND EFC."USERNAME" <> 'Yemeksepeti'
)
SELECT
CD."RESTAURANT_ID",
CD."USERNAME",
CD."COMMENT_TEXT",
CASE WHEN CD."COMMENT_DATE_INTERVAL" = ' today' THEN CD."DATE" ELSE CD."DATE" - CAST(CD."COMMENT_DATE_INTERVAL" AS INTERVAL) END::date AS "COMMENT_DATE",
CD."SPEED",
CD."SERVING",
CD."FLAVOUR",
CD."DATE"
FROM CLEAN_DATA CD;
"""
    comment_df = pd.read_sql(sql_command,conn)
    # Remove punctuation on the Python side as well.
    comment_df['COMMENT_TEXT'] = comment_df['COMMENT_TEXT'].apply(clean_text)
    comment_df
    # Convert every cell into a quoted/cast SQL literal for execute_mogrify.
    df_column_conversation(comment_df, 'RESTAURANT_ID', 'text')
    df_column_conversation(comment_df, 'USERNAME', 'text')
    df_column_conversation(comment_df, 'COMMENT_TEXT', 'text')
    df_column_conversation(comment_df, 'COMMENT_DATE', 'date')
    df_column_conversation(comment_df, 'SPEED', 'integer')
    df_column_conversation(comment_df, 'SERVING', 'integer')
    df_column_conversation(comment_df, 'FLAVOUR', 'integer')
    df_column_conversation(comment_df, 'DATE', 'date')
    # pandas renders missing Int64 values as '<NA>'; map them to SQL NULL.
    comment_df.replace('<NA>', 'NULL', inplace=True)
    execute_mogrify(conn,comment_df,"EDW","DWH_FB_COMMENT")
# Reload the cleaned warehouse comments for the modeling steps below.
sql_command = """
SELECT
*
FROM "EDW"."DWH_FB_COMMENT" EFR;
"""
comment_df = pd.read_sql(sql_command,conn)
comment_df
| RESTAURANT_ID | USERNAME | COMMENT_TEXT | COMMENT_DATE | SPEED | SERVING | FLAVOUR | DATE | |
|---|---|---|---|---|---|---|---|---|
| 0 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...o | herzamanki gibi mükemmel ellerinize saglik | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
| 1 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | favori pizzacım genelde madness söylüyordum yo... | 2021-05-25 | 10.0 | 10 | 10 | 2021-06-09 |
| 2 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...t | şahane şimdi tekrarlıyorum siparişi | 2021-05-25 | 10.0 | 10 | 10 | 2021-06-09 |
| 3 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | hamur seçimi yapılamıyor | 2021-05-24 | 10.0 | 9 | 7 | 2021-06-09 |
| 4 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...1 | pizza kesilmeden gönderilmişti lütfen dikkat e... | 2021-05-24 | 10.0 | 8 | 10 | 2021-06-09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271838 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | elinize sağlık çok başarılı | 2021-05-27 | 10.0 | 10 | 10 | 2021-06-09 |
| 271839 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...i | super | 2021-05-27 | 10.0 | 10 | 10 | 2021-06-09 |
| 271840 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...2 | pizzadan anlamayanların beğendiği bir pizza do... | 2021-05-27 | 10.0 | 5 | 5 | 2021-06-09 |
| 271841 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | cok guzeldi bu sefer tesekkurler | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
| 271842 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | hawaii pizza ve yarı yarıya pizzalar harika in... | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
271843 rows × 8 columns
# Remove Turkish stopwords from the review text, but keep the ones that carry
# sentiment or intensity ('çok', 'hiç', 'gibi', ...) for the classifiers.
stop_words = [element for element in stopwords.words('turkish') if element not in ['çok','eğer','gibi','hiç','niçin','niye','sanki','yani','en','az','birkaç','bazı','aslında','neden','hepsi']]
comment_df['COMMENT_TEXT'] = comment_df['COMMENT_TEXT'].apply(lambda x: ' '.join([word for word in x.lower().split() if word not in (stop_words)]))
# Stemming was tried and intentionally left disabled.
#comment_df['COMMENT_TEXT'] = comment_df['COMMENT_TEXT'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
comment_df
| RESTAURANT_ID | USERNAME | COMMENT_TEXT | COMMENT_DATE | SPEED | SERVING | FLAVOUR | DATE | |
|---|---|---|---|---|---|---|---|---|
| 0 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...o | herzamanki gibi mükemmel ellerinize saglik | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
| 1 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | favori pizzacım genelde madness söylüyordum yo... | 2021-05-25 | 10.0 | 10 | 10 | 2021-06-09 |
| 2 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...t | şahane şimdi tekrarlıyorum siparişi | 2021-05-25 | 10.0 | 10 | 10 | 2021-06-09 |
| 3 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | hamur seçimi yapılamıyor | 2021-05-24 | 10.0 | 9 | 7 | 2021-06-09 |
| 4 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...1 | pizza kesilmeden gönderilmişti lütfen dikkat e... | 2021-05-24 | 10.0 | 8 | 10 | 2021-06-09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271838 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | elinize sağlık çok başarılı | 2021-05-27 | 10.0 | 10 | 10 | 2021-06-09 |
| 271839 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...i | super | 2021-05-27 | 10.0 | 10 | 10 | 2021-06-09 |
| 271840 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...2 | pizzadan anlamayanların beğendiği bir pizza do... | 2021-05-27 | 10.0 | 5 | 5 | 2021-06-09 |
| 271841 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | cok guzeldi sefer tesekkurler | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
| 271842 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | hawaii pizza yarı yarıya pizzalar harika incec... | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
271843 rows × 8 columns
Cleaning our review dataset: lowercasing all text, removing punctuation, removing some Turkish stopwords, and estimating when each review was made. We also convert the string scores to integers.
# Generate and open a sweetviz EDA report per table (upToDateCheck=False
# forces regeneration even when the HTML file already exists).
analyze_data('eda_restaurant.html',False,'ODS','EXT_FB_RESTAURANT')
analyze_data('eda_menu.html',False,'EDW','DWH_FB_MENU')
analyze_data('eda_product_image.html',False,'ODS','EXT_FB_PRODUCT_IMAGE')
analyze_data('eda_comment.html',False,'EDW','DWH_FB_COMMENT')
Done! Use 'show' commands to display/save. |██████████| [100%] 00:00 -> (00:00 left)
Report eda_restaurant.html was generated. You can see the exploratory data analysis visualizations in your browser.
Done! Use 'show' commands to display/save. |██████████| [100%] 00:00 -> (00:00 left)
Report eda_menu.html was generated. You can see the exploratory data analysis visualizations in your browser.
Done! Use 'show' commands to display/save. |██████████| [100%] 00:00 -> (00:00 left)
Report eda_product_image.html was generated. You can see the exploratory data analysis visualizations in your browser.
Done! Use 'show' commands to display/save. |██████████| [100%] 00:00 -> (00:00 left)
Report eda_comment.html was generated. You can see the exploratory data analysis visualizations in your browser.
We have 1,189 distinct restaurants and 131,131 distinct products with 1,077 distinct categories and 23,109 distinct names; only 11% of the products are discounted.
Our product prices average around 41 with a standard deviation of 30.4. We have around 5,112 product images.
In the user reviews we have 271,843 comments, of which 221,312 are distinct. Most of the user scores are 10 (around 56–64% of the scores), and the second most common score is 1 (around 10–13%).
The dataset does not contain any missing data except in the user speed scores, where fewer than 1% of the rows (3,495) are missing.
# Build (only once) a word cloud of the 1000 most frequent review words,
# counted in SQL after stripping non-letter characters.
if(not(check_if_file_exists('images/word_cloud_popular_words.png'))):
    sql_command = """
WITH WORDS AS(
SELECT
regexp_split_to_table(regexp_replace("COMMENT_TEXT", '[^[:alpha:]]', '', 'g'), '\s+(\w\s+)*') AS "WORD"
FROM "EDW"."DWH_FB_COMMENT"
)
SELECT
"WORD",
COUNT(*) AS "WORD_COUNT"
FROM WORDS
WHERE LENGTH("WORD") > 1
GROUP BY WORDS."WORD"
ORDER BY COUNT(*) DESC
LIMIT 1000;
"""
    df_word_cloud = pd.read_sql(sql_command,conn)
    # word -> frequency mapping feeds the word-cloud renderer directly.
    word_cloud_data = dict(zip(df_word_cloud['WORD'].tolist(), df_word_cloud['WORD_COUNT'].tolist()))
    wc = WordCloud(background_color='white', width=8000, height=4000, max_words=10000).generate_from_frequencies(word_cloud_data)
    wc.to_file('images/word_cloud_popular_words.png')
# Display the (possibly cached) image in the notebook.
Image.open('images/word_cloud_popular_words.png')
We are going to transform our reviews with CountVectorizer; the process is similar to one-hot encoding, counting how many times each word appears for a label.
# Baseline model: bag-of-words (CountVectorizer) + Gaussian Naive Bayes,
# trained separately for each score label on the first 10,000 comments.
# Fitted models are cached on disk and reloaded when present.
for label in ['SPEED','SERVING','FLAVOUR']:
    model = CountVectorizer(min_df=3)
    # Keep only rows where this label is present.
    not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
    # GaussianNB needs a dense matrix, hence .todense().
    features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    #print(np.shape(labels))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_CountVectorizer_GaussianNB_{label}.mdl".format(label = label)
    if(check_if_file_exists(model_file_name)):
        model = joblib.load(model_file_name)
    else:
        model = GaussianNB()
        model.fit(features_train, labels_train)
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 33.73% F1 Score for SPEED: 33.73% Precision Score for SPEED: 33.73% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 29.20% F1 Score for SERVING: 29.20% Precision Score for SERVING: 29.20% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 28.80% F1 Score for FLAVOUR: 28.80% Precision Score for FLAVOUR: 28.80%
# Bag-of-words counts + RBF-kernel SVC, one model per label.
for label in ['SPEED','SERVING','FLAVOUR']:
    # Count features; drop terms seen in fewer than 3 reviews.
    vectorizer = CountVectorizer(min_df=3)
    # First 10,000 reviews, keeping only rows where this label is present.
    # The mask is built on the slice itself: indexing a slice with a
    # full-length boolean Series is unalignable in modern pandas.
    subset = comment_df[0:10000]
    not_null_df = subset[pd.notnull(subset[label])]
    # .toarray() yields an ndarray; .todense() returns np.matrix, which is
    # deprecated and rejected by recent scikit-learn.
    features = vectorizer.fit_transform(not_null_df['COMMENT_TEXT'].values).toarray()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_CountVectorizer_SVC_{label}.mdl".format(label = label)
    if check_if_file_exists(model_file_name):
        # Reuse the cached classifier instead of retraining.
        model = joblib.load(model_file_name)
    else:
        model = SVC()
        model.fit(features_train, labels_train)
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    # Micro-averaged F1/precision equal accuracy in single-label classification.
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 71.87% F1 Score for SPEED: 71.87% Precision Score for SPEED: 71.87% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 67.30% F1 Score for SERVING: 67.30% Precision Score for SERVING: 67.30% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 64.90% F1 Score for FLAVOUR: 64.90% Precision Score for FLAVOUR: 64.90%
# Bag-of-words counts + linear SVM, one model per label.
for label in ['SPEED','SERVING','FLAVOUR']:
    # Count features; drop terms seen in fewer than 3 reviews.
    vectorizer = CountVectorizer(min_df=3)
    # First 10,000 reviews, keeping only rows where this label is present.
    # The mask is built on the slice itself: indexing a slice with a
    # full-length boolean Series is unalignable in modern pandas.
    subset = comment_df[0:10000]
    not_null_df = subset[pd.notnull(subset[label])]
    # .toarray() yields an ndarray; .todense() returns np.matrix, which is
    # deprecated and rejected by recent scikit-learn.
    features = vectorizer.fit_transform(not_null_df['COMMENT_TEXT'].values).toarray()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_CountVectorizer_LinearSVC_{label}.mdl".format(label = label)
    if check_if_file_exists(model_file_name):
        # Reuse the cached classifier instead of retraining.
        model = joblib.load(model_file_name)
    else:
        model = LinearSVC()
        model.fit(features_train, labels_train)
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    # Micro-averaged F1/precision equal accuracy in single-label classification.
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 68.13% F1 Score for SPEED: 68.13% Precision Score for SPEED: 68.13% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 64.20% F1 Score for SERVING: 64.20% Precision Score for SERVING: 64.20% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 63.60% F1 Score for FLAVOUR: 63.60% Precision Score for FLAVOUR: 63.60%
# Bag-of-words counts + decision tree tuned with a small grid search.
for label in ['SPEED','SERVING','FLAVOUR']:
    # Count features; drop terms seen in fewer than 3 reviews.
    vectorizer = CountVectorizer(min_df=3)
    # First 10,000 reviews, keeping only rows where this label is present.
    # The mask is built on the slice itself: indexing a slice with a
    # full-length boolean Series is unalignable in modern pandas.
    subset = comment_df[0:10000]
    not_null_df = subset[pd.notnull(subset[label])]
    # .toarray() yields an ndarray; .todense() returns np.matrix, which is
    # deprecated and rejected by recent scikit-learn.
    features = vectorizer.fit_transform(not_null_df['COMMENT_TEXT'].values).toarray()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_CountVectorizer_DecisionTreeClassifier_{label}.mdl".format(label = label)
    if check_if_file_exists(model_file_name):
        # Reuse the cached classifier instead of retraining.
        model = joblib.load(model_file_name)
    else:
        params = {
            'max_depth': [10,13,14,15],
            'min_samples_split': [2,3,4]
        }
        gscv = GridSearchCV(DecisionTreeClassifier(), params, cv=5)
        gscv.fit(features_train, labels_train)
        print("Best parameters: {best_parameters}".format(best_parameters = gscv.best_params_))
        # best_estimator_ is already refit on the full training data
        # (refit=True is GridSearchCV's default), so no extra fit is needed.
        model = gscv.best_estimator_
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    # Micro-averaged F1/precision equal accuracy in single-label classification.
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 72.20% F1 Score for SPEED: 72.20% Precision Score for SPEED: 72.20% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 66.83% F1 Score for SERVING: 66.83% Precision Score for SERVING: 66.83% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 64.87% F1 Score for FLAVOUR: 64.87% Precision Score for FLAVOUR: 64.87%
# Bag-of-words counts + L2-regularized logistic regression, one model per label.
for label in ['SPEED','SERVING','FLAVOUR']:
    # Count features; drop terms seen in fewer than 3 reviews.
    vectorizer = CountVectorizer(min_df=3)
    # First 9,600 reviews, keeping only rows where this label is present.
    # The mask is built on the slice itself: indexing a slice with a
    # full-length boolean Series is unalignable in modern pandas.
    subset = comment_df[0:9600]
    not_null_df = subset[pd.notnull(subset[label])]
    # .toarray() yields an ndarray; .todense() returns np.matrix, which is
    # deprecated and rejected by recent scikit-learn.
    features = vectorizer.fit_transform(not_null_df['COMMENT_TEXT'].values).toarray()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_CountVectorizer_LogisticRegression_{label}.mdl".format(label = label)
    if check_if_file_exists(model_file_name):
        # Reuse the cached classifier instead of retraining.
        model = joblib.load(model_file_name)
    else:
        model = LogisticRegression(penalty='l2', C=1.2, n_jobs=-1)
        model.fit(features_train, labels_train)
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    # Micro-averaged F1/precision equal accuracy in single-label classification.
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3434 Accuracy Score for SPEED: 70.90% F1 Score for SPEED: 70.90% Precision Score for SPEED: 70.90% Total Features after vectorizing: 3434 Accuracy Score for SERVING: 67.33% F1 Score for SERVING: 67.33% Precision Score for SERVING: 67.33% Total Features after vectorizing: 3434 Accuracy Score for FLAVOUR: 65.73% F1 Score for FLAVOUR: 65.73% Precision Score for FLAVOUR: 65.73%
We are going to transform our reviews with the TF-IDF vectorizer; unlike plain counts, it weights each word by its frequency in a review relative to how common the word is across all reviews.
# TF-IDF features + Gaussian Naive Bayes, one model per label.
for label in ['SPEED','SERVING','FLAVOUR']:
    # TF-IDF features; drop terms seen in fewer than 3 reviews.
    vectorizer = TfidfVectorizer(min_df=3)
    # First 10,000 reviews, keeping only rows where this label is present.
    # The mask is built on the slice itself: indexing a slice with a
    # full-length boolean Series is unalignable in modern pandas.
    subset = comment_df[0:10000]
    not_null_df = subset[pd.notnull(subset[label])]
    # .toarray() yields an ndarray; .todense() returns np.matrix, which is
    # deprecated and rejected by recent scikit-learn (GaussianNB needs dense).
    features = vectorizer.fit_transform(not_null_df['COMMENT_TEXT'].values).toarray()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_TFIDFVectorizer_GaussianNB_{label}.mdl".format(label = label)
    if check_if_file_exists(model_file_name):
        # Reuse the cached classifier instead of retraining.
        model = joblib.load(model_file_name)
    else:
        model = GaussianNB()
        model.fit(features_train, labels_train)
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    # Micro-averaged F1/precision equal accuracy in single-label classification.
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 34.10% F1 Score for SPEED: 34.10% Precision Score for SPEED: 34.10% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 29.37% F1 Score for SERVING: 29.37% Precision Score for SERVING: 29.37% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 29.10% F1 Score for FLAVOUR: 29.10% Precision Score for FLAVOUR: 29.10%
# TF-IDF features + SVC tuned with a (reduced) grid search.
for label in ['SPEED','SERVING','FLAVOUR']:
    # TF-IDF features; drop terms seen in fewer than 3 reviews.
    vectorizer = TfidfVectorizer(min_df=3)
    # First 10,000 reviews, keeping only rows where this label is present.
    # The mask is built on the slice itself: indexing a slice with a
    # full-length boolean Series is unalignable in modern pandas.
    subset = comment_df[0:10000]
    not_null_df = subset[pd.notnull(subset[label])]
    # .toarray() yields an ndarray; .todense() returns np.matrix, which is
    # deprecated and rejected by recent scikit-learn.
    features = vectorizer.fit_transform(not_null_df['COMMENT_TEXT'].values).toarray()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_TFIDFVectorizer_SVC_{label}.mdl".format(label = label)
    if check_if_file_exists(model_file_name):
        # Reuse the cached classifier instead of retraining.
        model = joblib.load(model_file_name)
    else:
        # Wider grids (C in [1.4, 1.5, 1.6]; kernels linear/rbf/sigmoid) were
        # explored earlier; only the best-performing values are kept here.
        params = {
            'C': [1.5],
            'kernel': ['rbf']
        }
        gscv = GridSearchCV(SVC(), params, cv=5)
        gscv.fit(features_train, labels_train)
        print("Best parameters: {best_parameters}".format(best_parameters = gscv.best_params_))
        # best_estimator_ is already refit on the full training data
        # (refit=True is GridSearchCV's default), so no extra fit is needed.
        model = gscv.best_estimator_
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    # Micro-averaged F1/precision equal accuracy in single-label classification.
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 72.87% F1 Score for SPEED: 72.87% Precision Score for SPEED: 72.87% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 68.50% F1 Score for SERVING: 68.50% Precision Score for SERVING: 68.50% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 66.83% F1 Score for FLAVOUR: 66.83% Precision Score for FLAVOUR: 66.83%
# TF-IDF features + linear SVM, one model per label.
for label in ['SPEED','SERVING','FLAVOUR']:
    # TF-IDF features; drop terms seen in fewer than 3 reviews.
    vectorizer = TfidfVectorizer(min_df=3)
    # First 10,000 reviews, keeping only rows where this label is present.
    # The mask is built on the slice itself: indexing a slice with a
    # full-length boolean Series is unalignable in modern pandas.
    subset = comment_df[0:10000]
    not_null_df = subset[pd.notnull(subset[label])]
    # .toarray() yields an ndarray; .todense() returns np.matrix, which is
    # deprecated and rejected by recent scikit-learn.
    features = vectorizer.fit_transform(not_null_df['COMMENT_TEXT'].values).toarray()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_TFIDFVectorizer_LinearSVC_{label}.mdl".format(label = label)
    if check_if_file_exists(model_file_name):
        # Reuse the cached classifier instead of retraining.
        model = joblib.load(model_file_name)
    else:
        model = LinearSVC()
        model.fit(features_train, labels_train)
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    # Micro-averaged F1/precision equal accuracy in single-label classification.
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 70.63% F1 Score for SPEED: 70.63% Precision Score for SPEED: 70.63% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 66.77% F1 Score for SERVING: 66.77% Precision Score for SERVING: 66.77% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 66.30% F1 Score for FLAVOUR: 66.30% Precision Score for FLAVOUR: 66.30%
# TF-IDF features + decision tree tuned with a small grid search.
for label in ['SPEED','SERVING','FLAVOUR']:
    # TF-IDF features; drop terms seen in fewer than 3 reviews.
    vectorizer = TfidfVectorizer(min_df=3)
    # First 9,600 reviews, keeping only rows where this label is present.
    # The mask is built on the slice itself: indexing a slice with a
    # full-length boolean Series is unalignable in modern pandas.
    subset = comment_df[0:9600]
    not_null_df = subset[pd.notnull(subset[label])]
    # .toarray() yields an ndarray; .todense() returns np.matrix, which is
    # deprecated and rejected by recent scikit-learn.
    features = vectorizer.fit_transform(not_null_df['COMMENT_TEXT'].values).toarray()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_TFIDFVectorizer_DecisionTreeClassifier_{label}.mdl".format(label = label)
    if check_if_file_exists(model_file_name):
        # Reuse the cached classifier instead of retraining.
        model = joblib.load(model_file_name)
    else:
        params = {
            'max_depth': [9,10,11],
            'min_samples_split': [2,3]
        }
        gscv = GridSearchCV(DecisionTreeClassifier(), params, cv=5)
        gscv.fit(features_train, labels_train)
        print("Best parameters: {best_parameters}".format(best_parameters = gscv.best_params_))
        # best_estimator_ is already refit on the full training data
        # (refit=True is GridSearchCV's default), so no extra fit is needed.
        model = gscv.best_estimator_
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    # Micro-averaged F1/precision equal accuracy in single-label classification.
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3434 Accuracy Score for SPEED: 71.56% F1 Score for SPEED: 71.56% Precision Score for SPEED: 71.56% Total Features after vectorizing: 3434 Accuracy Score for SERVING: 66.70% F1 Score for SERVING: 66.70% Precision Score for SERVING: 66.70% Total Features after vectorizing: 3434 Accuracy Score for FLAVOUR: 64.44% F1 Score for FLAVOUR: 64.44% Precision Score for FLAVOUR: 64.44%
# TF-IDF features + L2-regularized logistic regression, one model per label.
for label in ['SPEED','SERVING','FLAVOUR']:
    # TF-IDF features; drop terms seen in fewer than 3 reviews.
    vectorizer = TfidfVectorizer(min_df=3)
    # First 9,600 reviews, keeping only rows where this label is present.
    # The mask is built on the slice itself: indexing a slice with a
    # full-length boolean Series is unalignable in modern pandas.
    subset = comment_df[0:9600]
    not_null_df = subset[pd.notnull(subset[label])]
    # .toarray() yields an ndarray; .todense() returns np.matrix, which is
    # deprecated and rejected by recent scikit-learn.
    features = vectorizer.fit_transform(not_null_df['COMMENT_TEXT'].values).toarray()
    labels = not_null_df[label].values
    print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
    model_file_name = "models/BoW_TFIDFVectorizer_LogisticRegression_{label}.mdl".format(label = label)
    if check_if_file_exists(model_file_name):
        # Reuse the cached classifier instead of retraining.
        model = joblib.load(model_file_name)
    else:
        model = LogisticRegression(penalty='l2', C=1.2, n_jobs=-1)
        model.fit(features_train, labels_train)
        joblib.dump(model, model_file_name)
    label_prediction = model.predict(features_test)
    # Micro-averaged F1/precision equal accuracy in single-label classification.
    print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
    print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
    print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3434 Accuracy Score for SPEED: 72.64% F1 Score for SPEED: 72.64% Precision Score for SPEED: 72.64% Total Features after vectorizing: 3434 Accuracy Score for SERVING: 68.61% F1 Score for SERVING: 68.61% Precision Score for SERVING: 68.61% Total Features after vectorizing: 3434 Accuracy Score for FLAVOUR: 66.63% F1 Score for FLAVOUR: 66.63% Precision Score for FLAVOUR: 66.63%
# Fetch all pizza products together with IQR-based outlier bounds on their
# listed price.  Query structure:
#   EFM            -- distinct menu rows that mention "pizza" in the category,
#                     product name or description, excluding sauce/side
#                     categories ('soslar', 'yan ...')
#   QUARTILES      -- Q1/Q2/Q3 of PRODUCT_LISTED_PRICE over those rows
#   OUTLIER_RANGES -- IQR and the Tukey fences Q1-1.5*IQR / Q3+1.5*IQR
#   PIZZAS         -- pizza rows whose listed price falls inside the fences
# The final SELECT joins back to the full menu table to recover all columns
# (ids, dates, etc.) for the in-range pizza rows, carrying the quartile
# statistics along on every row.
sql_command = """
WITH EFM AS(
SELECT DISTINCT
EFM."CATEGORY_NAME",
EFM."PRODUCT_NAME",
EFM."PRODUCT_DESCRIPTION",
EFM."PRODUCT_LISTED_PRICE",
EFM."PRODUCT_PRICE",
EFM."DISCOUNT"
FROM "EDW"."DWH_FB_MENU" EFM
WHERE 1=1
AND CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END = 1
AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%soslar%' AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%yan %'
),
QUARTILES AS(
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q1,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q2,
PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q3
FROM EFM
),
OUTLIER_RANGES AS(
SELECT
QR.Q1,
QR.Q2,
QR.Q3,
QR.Q3-QR.Q1 AS IQR,
QR.Q1-(1.5*(QR.Q3-QR.Q1)) AS MINIMA,
QR.Q3+(1.5*(QR.Q3-QR.Q1)) AS MAXIMA
FROM QUARTILES QR
),
PIZZAS AS(
SELECT *
FROM EFM
INNER JOIN OUTLIER_RANGES ORA ON (1=1)
WHERE EFM."PRODUCT_LISTED_PRICE" BETWEEN ORA.MINIMA AND ORA.MAXIMA
ORDER BY EFM."PRODUCT_LISTED_PRICE"
)
SELECT
EFM.*,
PIZ.Q1,
PIZ.Q2,
PIZ.Q3,
PIZ.IQR,
PIZ.MINIMA,
PIZ.MAXIMA
FROM "EDW"."DWH_FB_MENU" EFM
INNER JOIN PIZZAS PIZ ON (EFM."CATEGORY_NAME" = PIZ."CATEGORY_NAME" AND EFM."PRODUCT_NAME" = PIZ."PRODUCT_NAME" AND EFM."PRODUCT_DESCRIPTION" = PIZ."PRODUCT_DESCRIPTION" AND EFM."PRODUCT_LISTED_PRICE" = PIZ."PRODUCT_LISTED_PRICE");
"""
menu_df = pd.read_sql(sql_command,conn)
# Quick visual sanity check of the fetched rows.
menu_df.head()
| PRODUCT_ID | RESTAURANT_ID | CATEGORY_NAME | PRODUCT_NAME | PRODUCT_DESCRIPTION | PRODUCT_LISTED_PRICE | PRODUCT_PRICE | DISCOUNT | DESIGN_TYPE | DATE | q1 | q2 | q3 | iqr | minima | maxima | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4F9A1222-2174-4794-85BE-78B864137B58 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Vodafone Menüleri | Vodafone Menüsü (Pizza Refanzine (32 cm.)) | Pizza Refanzine (32 cm.) + Coca-Cola (33 cl.) ... | 52.0 | 52.0 | False | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 1 | 8b503b63-b162-4b43-a58c-301a41c6b98a | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Rucola (25 cm.) | Dante sos, mozzarella peyniri, roka, cherry do... | 66.0 | 33.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 2 | 15cff837-c4f5-4ded-b09e-dd8c6fd5e591 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Funghi (25 cm.) | Dante sos, mozzarella peyniri, kültür mantarı | 58.0 | 29.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 3 | ef08f896-366b-40b0-9858-6cdc69401f41 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Quattro Formaggi (25 cm.) | Dante sos, mozzarella peyniri, parmesan peynir... | 76.0 | 38.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 4 | e7e2b046-6d7b-4636-bcd7-59be32110798 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Mediterraneo (25 cm.) | Dante sos, mozzarella peyniri, beyaz peynir, y... | 66.0 | 33.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
# Same pizza/outlier CTE pipeline as the previous query, but aggregated per
# restaurant: for every restaurant, the average listed price of its in-range
# pizza products and how many such products it offers.
sql_command = """
WITH EFM AS(
SELECT DISTINCT
EFM."CATEGORY_NAME",
EFM."PRODUCT_NAME",
EFM."PRODUCT_DESCRIPTION",
EFM."PRODUCT_LISTED_PRICE",
EFM."PRODUCT_PRICE",
EFM."DISCOUNT"
FROM "EDW"."DWH_FB_MENU" EFM
WHERE 1=1
AND CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END = 1
AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%soslar%' AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%yan %'
),
QUARTILES AS(
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q1,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q2,
PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q3
FROM EFM
),
OUTLIER_RANGES AS(
SELECT
QR.Q1,
QR.Q2,
QR.Q3,
QR.Q3-QR.Q1 AS IQR,
QR.Q1-(1.5*(QR.Q3-QR.Q1)) AS MINIMA,
QR.Q3+(1.5*(QR.Q3-QR.Q1)) AS MAXIMA
FROM QUARTILES QR
),
PIZZAS AS(
SELECT *
FROM EFM
INNER JOIN OUTLIER_RANGES ORA ON (1=1)
WHERE EFM."PRODUCT_LISTED_PRICE" BETWEEN ORA.MINIMA AND ORA.MAXIMA
ORDER BY EFM."PRODUCT_LISTED_PRICE"
)
SELECT
EFM."RESTAURANT_ID",
AVG(EFM."PRODUCT_LISTED_PRICE") AS "AVERAGE_PRODUCT_LISTED_PRICE",
COUNT(*) AS "NUMBER_OF_PRODUCTS"
FROM "EDW"."DWH_FB_MENU" EFM
INNER JOIN PIZZAS PIZ ON (EFM."CATEGORY_NAME" = PIZ."CATEGORY_NAME" AND EFM."PRODUCT_NAME" = PIZ."PRODUCT_NAME" AND EFM."PRODUCT_DESCRIPTION" = PIZ."PRODUCT_DESCRIPTION" AND EFM."PRODUCT_LISTED_PRICE" = PIZ."PRODUCT_LISTED_PRICE")
GROUP BY EFM."RESTAURANT_ID";
"""
restaurant_df = pd.read_sql(sql_command,conn)
# Quick visual sanity check of the fetched rows.
restaurant_df.head()
| RESTAURANT_ID | AVERAGE_PRODUCT_LISTED_PRICE | NUMBER_OF_PRODUCTS | |
|---|---|---|---|
| 0 | 1991-tarihi-ortakoy-kumpircisi-silivri-yeni-ma... | 41.600000 | 10 |
| 1 | 400-c-pizza-sisli-tesvikiye-mah-istanbul | 62.042553 | 47 |
| 2 | 400-c-pizza-umraniye-serifali-mah-istanbul | 63.476190 | 42 |
| 3 | 444-pizza-bagcilar-demirkapi-mah-istanbul | 43.163793 | 58 |
| 4 | 444-pizza-eyup-emniyettepe-mah-istanbul | 43.818462 | 65 |
# Cluster products into 5 groups by (listed price, discounted price) with
# K-Means and cache the scatter plot on disk.
if not check_if_file_exists('images/kmeans_product_cluster.png'):
    price_pairs = menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values
    clusterer = KMeans(n_clusters=5).fit(price_pairs)
    cluster_ids = clusterer.predict(price_pairs)
    plt.scatter(menu_df["PRODUCT_LISTED_PRICE"].values, menu_df["PRODUCT_PRICE"].values, c=cluster_ids, cmap='viridis')
    plt.xlabel("Listed Price")
    plt.ylabel("Price with Discount")
    plt.title('K-Means Clustering on Products')
    plt.savefig('images/kmeans_product_cluster.png')
    plt.close()
# Display the (possibly cached) plot.
Image.open('images/kmeans_product_cluster.png')
# Cluster restaurants into 5 groups by (average listed price, product count)
# with K-Means and cache the scatter plot on disk.
if not check_if_file_exists('images/kmeans_restaurant_cluster.png'):
    feature_pairs = restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values
    clusterer = KMeans(n_clusters=5).fit(feature_pairs)
    cluster_ids = clusterer.predict(feature_pairs)
    plt.scatter(restaurant_df["AVERAGE_PRODUCT_LISTED_PRICE"].values, restaurant_df["NUMBER_OF_PRODUCTS"].values, c=cluster_ids, cmap='viridis')
    plt.xlabel("Average Listed Price")
    plt.ylabel("Number of Products")
    plt.title('K-Means Clustering on Restaurants')
    plt.savefig('images/kmeans_restaurant_cluster.png')
    plt.close()
# Display the (possibly cached) plot.
Image.open('images/kmeans_restaurant_cluster.png')
# Cluster products into 5 groups by (listed price, discounted price) with a
# Gaussian mixture model and cache the scatter plot on disk.
if not check_if_file_exists('images/gmm_product_cluster.png'):
    price_pairs = menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values
    gmm = mixture.GaussianMixture(n_components=5).fit(price_pairs)
    cluster_ids = gmm.predict(price_pairs)
    plt.scatter(menu_df["PRODUCT_LISTED_PRICE"].values, menu_df["PRODUCT_PRICE"].values, c=cluster_ids, cmap='viridis')
    plt.xlabel("Listed Price")
    plt.ylabel("Price with Discount")
    plt.title('GMM Clustering on Products')
    plt.savefig('images/gmm_product_cluster.png')
    plt.close()
# Display the (possibly cached) plot.
Image.open('images/gmm_product_cluster.png')
# Cluster restaurants into 5 groups by (average listed price, product count)
# with a Gaussian mixture model and cache the scatter plot on disk.
if not check_if_file_exists('images/gmm_restaurant_cluster.png'):
    feature_pairs = restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values
    gmm = mixture.GaussianMixture(n_components=5).fit(feature_pairs)
    cluster_ids = gmm.predict(feature_pairs)
    plt.scatter(restaurant_df["AVERAGE_PRODUCT_LISTED_PRICE"].values, restaurant_df["NUMBER_OF_PRODUCTS"].values, c=cluster_ids, cmap='viridis')
    plt.xlabel("Average Listed Price")
    plt.ylabel("Number of Products")
    plt.title('GMM Clustering on Restaurants')
    plt.savefig('images/gmm_restaurant_cluster.png')
    plt.close()
# Display the (possibly cached) plot.
Image.open('images/gmm_restaurant_cluster.png')
Markov chains calculate how frequently words follow each other and generate new text depending on the first words we are given. These models can be used to create fake reviews.
# For each restaurant, concatenate all its review texts (lower-cased) into
# one string and count them, ordered by comment volume so the most-reviewed
# restaurant comes first -- the Markov model below is trained on row 0.
sql_command = """
WITH COMMENTS AS(
SELECT "RESTAURANT_ID",
STRING_AGG(LOWER("COMMENT_TEXT"), ' ') AS ALL_COMMENT,
COUNT("COMMENT_TEXT") AS COMMENT_COUNT
FROM "ODS"."EXT_FB_COMMENT" EFC
GROUP BY "RESTAURANT_ID"
)
SELECT *
FROM COMMENTS
ORDER BY COMMENT_COUNT DESC;
"""
restaurant_df = pd.read_sql(sql_command,conn)
# Quick visual sanity check of the fetched rows.
restaurant_df.head()
| RESTAURANT_ID | all_comment | comment_count | |
|---|---|---|---|
| 0 | pizza-bulls-uskudar-altunizade-mah-istanbul | lezzeti eskisi gibi gelmedi bize. fark ödeyip ... | 2293 |
| 1 | pizza-bulls-uskudar-ferah-mah-istanbul | her şey çok güzeldi. elinize sağlık. hizli ve ... | 1764 |
| 2 | pizza-bulls-kartal-soganlik-yeni-mah-istanbul | cok pahali olmasi disinda bir problem yok gibi... | 1743 |
| 3 | pizza-bulls-umraniye-cakmak-mah-istanbul | çok lezzetli güvenle çoçuguma yedirebiliyorum ... | 1582 |
| 4 | pizza-bulls-atasehir-fetih-mah-istanbul | servis ve hız 10 üzerinden 20.. bu siparişi is... | 1277 |
import random
def make_markov_model(cleaned_stories, n_gram=2):
    """Build an n-gram Markov chain from a list of word tokens.

    Keys are space-joined n-gram states; each value maps a following
    n-gram state to its transition probability (counts normalized per
    current state).

    :param cleaned_stories: list of word tokens
    :param n_gram: number of words per state
    :return: dict of {state: {next_state: probability}}
    """
    markov_model = {}
    # A transition reads tokens i .. i+2*n_gram-1, so the last valid start
    # index is len-2*n_gram.  (The previous bound, len-n_gram-1, read past
    # the end of the list for n_gram > 2 and dropped the final transition
    # for n_gram = 1.)
    for i in range(len(cleaned_stories) - 2 * n_gram + 1):
        curr_state = " ".join(cleaned_stories[i:i + n_gram])
        next_state = " ".join(cleaned_stories[i + n_gram:i + 2 * n_gram])
        transitions = markov_model.setdefault(curr_state, {})
        transitions[next_state] = transitions.get(next_state, 0) + 1
    # Normalize raw counts into transition probabilities.
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            transition[state] = count / total
    return markov_model
def generate_story(markov_model, limit=100, start='my god'):
    """Generate pseudo-text by randomly walking the Markov chain.

    Parameters
    ----------
    markov_model : dict[str, dict[str, float]]
        Transition table produced by make_markov_model.
    limit : int, default 100
        Maximum number of state transitions to take.
    start : str
        Seed state; should be a key of markov_model for any text beyond
        the seed to be generated.

    Returns
    -------
    str
        The seed followed by up to `limit` sampled states, space separated
        (with a trailing space, matching the original output format).
    """
    curr_state = start
    story = curr_state + " "
    for _ in range(limit):
        # Robustness fix: stop instead of raising KeyError when the walk
        # reaches a state that never occurred as a "current" state in the
        # training data (a dead end at the tail of the corpus).
        if curr_state not in markov_model:
            break
        transitions = markov_model[curr_state]
        # Weighted random draw according to the transition probabilities.
        curr_state = random.choices(list(transitions.keys()),
                                    list(transitions.values()))[0]
        story += curr_state + " "
    return story
# Clean the aggregated comment text with the `clean_text` helper defined
# earlier in the file, train a bigram Markov model on the restaurant with
# the most comments (row 0), and sample 20 fake reviews from it.
restaurant_df['clean_comment'] = restaurant_df['all_comment'].apply(clean_text)
text = restaurant_df.loc[0]['clean_comment'].split()
markov_model = make_markov_model(text)
for i in range(20):
    # Each sample starts from the same seed state "yemek çok" ("food very").
    print(str(i)+". ", generate_story(markov_model, start="yemek çok", limit=20))
0. yemek çok lezzetliydi ve hızlı ulaştı he zamanki gibi cumali beye ayrıca teşekkür ederiz ilgisine alakasına gayet sıcak geldi atacan beye de teşekkür ediyorum her zamanki gibi sıcak ve lezzetliydi ayrıca ikramınız için teşekkür ederim pizza bulls altunizade olduğu sürece her fırsatta 1. yemek çok lezzetliydi ve getiren arkadasta cok hizli getirdi ve cok nazikti semih beye teşekkürler sıcacık pizzalariniz ve guleryuzlu personelleriniz icin size 10 puan lezzet bilal beye ekstra tesekkur ediyorum pizza dışındaki yan ürünler lezzet açısından sınıfta kaldı pizzanın dilim kesimleri neredeyse 2. yemek çok güzeldi semih beye ilgisinden ve tatli ikramindan dolayi tesekkur ederiz pizza bulls çalışanlarına özellikle duygu hanıma ve restorana ikram için ayrıca çok teşekkürler bilal bey siparişi verdim atakan bey aradı ve memnuniyetimi sordu gayet memnun olduğumu ilettim misafirlerim oldugunu söyledim 3. yemek çok güzeldi semih beye ilgisinden dolayı çok teşekkür ederim sn damla erdemin tatlı süprizi için teşekkür ederiz pizza bağımlısı olduk atakan beye ilgisinden ve ikramından dolayı özellikle teşekkür ediyorum tatlı için teşekkür ederim her zaman olduğu gibi çok lezzetliydi pizza yanında 4. yemek çok güzeldi semih bey harikasınız sıcak geldi ve lezzetliydi atakan bey çok sağolsun çok ilgili birisi i̇yi kenan beye güler yüzünden dolayı teşekkürler bilal beye çok teşekkür ederim böyle devam bam bam bam bam i̇yi seneler hızlı teslimatı güler yüzü ve 5. yemek çok lezzetliydi ve yanında 2 adet ikram sufle geldi teşekkür ederim çok hızlı ve çok lezzetli geldi atakan beye de cok guzeldi harikasiniz bilal beye selamlar her şey çok güzeldi kurye özgür beye de çok güzeldi pizza bulls ailesine teşekkürler semih 6. 
yemek çok lezzetliydi ve her şey çok güzeldi ve sipariş çok hızlı çok lezzetli ve hızlıydı başarıların devamını dilerim güleryüzlü hizmet sorunsuz servis ve pizzayı yedirten kenar sos unutuluyor artık her seferinde yanında muhakkak bir hediye oluyor bu sefer sufle ikram etmesede 7. yemek çok güzeldi semih beyin özel ilgisi için çok teşekkür ederiz sipariş hem hızlıydı hem de ikram gönderdi şiddetle tavsiye ederim pizzalar sıcak ve hızlı herzaman şaşmayan lezzet mükemmelsiniz ikramınız için teşekkürler mükemmel lezzet mükemmel servis yine bilal beye çok teşekkür ediyoruz 8. yemek çok güzeldi semih bey harikasınız maşallah size bize her zaman siz getirin ve sufle getirin mükemmel hizmet gayet hızlı geldi kuryenize teşekkürler çok iyi çok lezzetli ve hızlı çok hızlı sıcak ve lezzetli ikram icin tesekkurler atakan beye teşekkürler 10 10 9. yemek çok güzeldi semih beye ilgisinden dolayı teşekkür ederim her zamanki gibi cok kibardi bilal beye ikramı için çok teşekkür ederiz yardımcı oldu bilal beye çok teşekkürler pizza bulls ailesi bilal bey kuryeye güler yüzünden dolayı özellikle çok teşekkür ediyorum elinize sağlık 10. yemek çok güzeldi semih beye ilgisinden dolayı teşekkür ederim atakan bey cidden çok mükemmel bir insan işini böylesine severek yapmasına ve iletişimine hayran kaldık pizza işte pizza konusunda gerçekten iyiler şuana kadar gördüğüm hem tadı hem hızı hem de tavırları 10 luk 11. yemek çok lezzetliydi ve kuriyeniz atakan bey çok nazikti bilal beye ikramından dolayı teşekkür ederiz bilal beye de teşekkürler anlayışı için üsküdarda pizza yenilebilecek tek adres teşekkür ederiz mukemmel lezzet tam zamanlama atakan bey i̇lgi ve alaka çok iyiydi fırat beye teşekkürler 12. yemek çok lezzetliydi ve kuriyeniz atakan bey çok nazikti teşekkür ederim gayet memnun kaldım i̇yi çalışmalar good harika çıtır soğanları patatesleri çıtır tavukları enfesti güler yüzlü oluyor her zaman yanında sufle ya da hediye patates getiriyorlar pizza için 18 tl ek ödeme 13. 
yemek çok lezzetliydi ve sıcak geldi servis gayet güzel ikram için de çok teşekkür ederim 10 senedir buradan pizza söylüyorum ilk defa bu kadar kısa sürede çözüm bulursunuz umarım tatlı için teşekkürler hızlı lezzetli hizmet için ve atakan beye çok teşekkür ederiz 14. yemek çok lezzetliydi ve kuriyeniz atakan bey çok kibar ve işlerini iyi yapıyorlar pizzalar lezzetli bilal beye özellikle teşekkür ederim sucukta olabilirdi icinde sufle hiç lezzetli değildi pizzanın da malzemesi azdı teşekkürlerrr verdiğim her siparişden memnun kalıyorum harun bey e ikramından dolayı 15. yemek çok lezzetliydi ve kurye de geç getirmedi ama nasıl böyle soguk ve tatsız geldi anlamadım büyük boy pizza ve hediyeleri içn çok teşekkür ederiz efsane her konuda oldukça başarılılar her şey için teşekkürler bu işin en iyisi pizza bulls her zaman 16. yemek çok güzeldi semih beye çok teşekkürler gayet güzel gayet lezzetli ve hızlı çok sıcak ve malzeme kalitesi olarak gerçekten iyi tatlı ikramı içinde teşekkür ederim desteklenmeyi sonuna kadar hak ediyorsunuz semih bey e teşekkürler bilal beye kibarlığı ve işine olan saygısından 17. yemek çok lezzetliydi ve kurye de gayet iyiydi bu klasmanda en iyi sipariş 10 numara 5 yıldız 10 numara pizzacı 10 puan semih beye teşekkürler herzaman 10 numara atakan abiye selamlar çok kibardı atakan bey süper atakan beye ilgisinden dolayı teşekkür ederiz 18. yemek çok lezzetliydi ve hızlı ulaştı servisi yapan arkadaş ali beye çok teşekkürler keyifli bir sahur yaptık yardımseverliğinden ve nezaketinden ötürü atakan beye teşekkür ederiz çok zarifsiniz elinize sağlık harun kardesım ikramlar için tesekkurler tarık bey i̇kram sufle de gayet iyi bilal 19. yemek çok lezzetliydi ve kuriyeniz atakan bey çok güler yüzlü hizmet de cabası elinize sağlık dehset pizza da teksiniz müthiş müthiş müthiş ve notlarımı dikkate almaları ayrıca bir teşekkürümüzü iletirsiniz siparişim çok geç geldi ama pizzalar vasattı ne yazık ki hele ki
Convolutional neural networks are used for image classification. For our labels, we check whether the product name, description, or product category contains the word "pizza".
# Label every distinct product image: IS_PIZZA = 1 when the category name,
# product name, or product description mentions "pizza" (case-insensitive),
# otherwise 0. The SOURCE column holds the raw image bytes scraped earlier.
sql_command = """
SELECT
DISTINCT
FPI."IMAGE_LINK",
FPIS."SOURCE",
CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END AS "IS_PIZZA"
FROM "ODS"."EXT_FB_MENU" EFM
INNER JOIN "ODS"."EXT_FB_PRODUCT_IMAGE" FPI ON (EFM."PRODUCT_ID" = FPI."PRODUCT_ID")
INNER JOIN "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" FPIS ON (FPI."IMAGE_LINK" = FPIS."IMAGE_LINK");
"""
# NOTE(review): `conn` is assumed to be the open psycopg2 connection created
# earlier in this file.
image_source_df = pd.read_sql(sql_command,conn)
image_source_df
| | IMAGE_LINK | SOURCE | IS_PIZZA |
|---|---|---|---|
| 0 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 1 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 2 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 3 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 4 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| ... | ... | ... | ... |
| 4961 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 1 |
| 4962 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 4963 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 1 |
| 4964 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 1 |
| 4965 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
4966 rows × 3 columns
# Decode every product image into a fixed 200x200 RGB array and pair it
# with its pizza / not-pizza label.
# Iterating the columns directly avoids the positional `df.iloc[i]` lookup
# the original performed twice per row (slow on large frames), and the
# commented-out dead code has been removed.
features = [byte_image_to_numpy_array(source, 200)
            for source in image_source_df["SOURCE"]]
labels = image_source_df["IS_PIZZA"].tolist()
# Stack into an (n, 200, 200, 3) tensor and scale pixel values into [0, 1].
features = np.array(features).reshape(-1, 200, 200, 3)
features = features.astype('float32')/255
# One-hot encode the binary labels for the 2-unit softmax output layer.
# NOTE(review): keras.utils.np_utils was removed in modern Keras;
# tf.keras.utils.to_categorical is the drop-in replacement.
labels = np_utils.to_categorical(labels, 2)
batch_size = 16
epochs = 3
print("Feature Shape: {f_shape} Label Shape: {l_shape}".format(f_shape=features.shape, l_shape=labels.shape))
# Hold out 30% of the images for validation; fixed seed for reproducibility.
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
# Small CNN for the binary pizza / not-pizza image classifier:
# two conv+pool stages, then a dropout-regularized dense head.
cnn_layers = [
    tf.keras.layers.Conv2D(32, (3,3), padding='same', activation=tf.nn.relu,
                           input_shape=(200, 200, 3)),
    tf.keras.layers.MaxPooling2D((2, 2), strides=2),
    tf.keras.layers.Conv2D(32, (3,3), padding='same', activation=tf.nn.relu),
    tf.keras.layers.MaxPooling2D((2, 2), strides=2),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(2, activation=tf.nn.softmax),
]
model = tf.keras.Sequential(cnn_layers)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train for `epochs` passes, validating on the held-out split every epoch.
model.fit(features_train, labels_train, batch_size=batch_size, epochs=epochs,
          verbose=1, validation_data=(features_test, labels_test))
# Per-class probability predictions for the test images.
label_prediction = model.predict(features_test)
score = model.evaluate(features_test, labels_test, verbose=0)
print("""Loss: {loss_score:0.4f} Accuracy: {accuracy:0.2f}%""".format(loss_score=score[0], accuracy=score[1]*100))
Feature Shape: (4966, 200, 200, 3) Label Shape: (4966, 2) Epoch 1/3 218/218 [==============================] - 99s 451ms/step - loss: 0.6135 - accuracy: 0.7629 - val_loss: 0.3277 - val_accuracy: 0.8631 Epoch 2/3 218/218 [==============================] - 97s 447ms/step - loss: 0.3181 - accuracy: 0.8662 - val_loss: 0.2936 - val_accuracy: 0.8799 Epoch 3/3 218/218 [==============================] - 97s 445ms/step - loss: 0.2292 - accuracy: 0.9068 - val_loss: 0.2417 - val_accuracy: 0.9034 Loss: 0.2417 Accuracy: 90.34%
# Save one example test image for each confusion-matrix cell (TP/TN/FP/FN),
# unless all four example images already exist on disk.
if(not(check_if_file_exists('images/cnn_tp.png') and check_if_file_exists('images/cnn_tn.png') and check_if_file_exists('images/cnn_fp.png') and check_if_file_exists('images/cnn_fn.png'))):
    def _save_example(row, title, path):
        # Render the test image at index `row` with `title` and save to `path`.
        plt.title(title, fontweight='bold', fontsize=15)
        plt.imshow((features_test[row]*255).astype(np.uint8))
        plt.axis('off')
        plt.savefig(path)
        plt.close()
    # Bug fix: the original used False as the "not found yet" sentinel and
    # compared with `== False`, but a match at test row 0 satisfies
    # `0 == False`, so that cell kept being overwritten by later matches.
    # None is an unambiguous sentinel for "no row found yet".
    tp, tn, fp, fn = None, None, None, None
    for test_row in range(len(labels_test)):
        true_label = labels_test[test_row][1].astype(np.uint8)
        prediction = round(label_prediction[test_row][1])
        if true_label == 1 and prediction == 1 and tp is None:
            tp = test_row
            _save_example(tp, 'True Positive', 'images/cnn_tp.png')
        elif true_label == 0 and prediction == 0 and tn is None:
            tn = test_row
            _save_example(tn, 'True Negative', 'images/cnn_tn.png')
        elif true_label == 0 and prediction == 1 and fp is None:
            fp = test_row
            _save_example(fp, 'False Positive', 'images/cnn_fp.png')
        elif true_label == 1 and prediction == 0 and fn is None:
            fn = test_row
            _save_example(fn, 'False Negative', 'images/cnn_fn.png')
        if None not in (tp, tn, fp, fn):
            break  # all four examples saved; no need to scan further
# Stitch the four confusion-cell example images side by side into a single
# horizontal strip for display.
image_paths = ['images/cnn_tp.png', 'images/cnn_tn.png', 'images/cnn_fp.png', 'images/cnn_fn.png']
images = [Image.open(x) for x in image_paths]
widths = tuple(im.size[0] for im in images)
heights = tuple(im.size[1] for im in images)
# Canvas spans all images in width and matches the tallest image in height.
total_width = sum(widths)
max_height = max(heights)
merged_im = Image.new('RGB', (total_width, max_height))
x_offset = 0
for im in images:
    merged_im.paste(im, (x_offset, 0))
    x_offset += im.size[0]
merged_im